# library preparations
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import joblib
import seaborn as sns
import time
Given that we could just ask the simulation for more samples, I decided to NOT employ a typical train/test split. Rather, I will use all samples for training and optimize based on cross-validation approaches, giving every record a turn at being in both the training set and the validation set. Once the model is fitted, we can then ask the simulation for some more samples (say 1000) to use as a test set completely unrelated to the data used here.
Many metrics can be used to evaluate models, some I calculate here are:

Sometimes, in the realm of health sciences, we want to punish or reward the model for doing particularly well (or badly) on one kind of error compared to another. For example, in cancer prediction, more emphasis is placed on avoiding False Negatives (failing to detect the cancer), so we may wish to assign costs/weights (negative means reward) to TP, FP, TN, and FN like this:

This can be implemented as an alternative to the above metrics during cross-validation. Alternatively, the cost matrix can be used to classify one particular record; that is, we can use the cost matrix to evaluate risk.
With a RandomForest, I am able to extract the probability of a sample as showing SAS or not, then say:
P(SAS) = 0.2 P(neutral, other) = 0.8
Given the above cost matrix, then when I:
# Load the pre-generated simulation samples from disk.
# NOTE(review): allow_pickle=True is only safe on trusted .npz files.
load_train = np.load('./data/train.npz', allow_pickle=True)
load_test = np.load('./data/test_large.npz', allow_pickle=True)
X_train, y_train = load_train['X_train'], load_train['y_train']
X_test, y_test = load_test['X_test_large'], load_test['y_test_large']

# Remember the per-sample window shape before flattening; `rows` and
# `cols` are reused later to fold feature importances back into 2-D.
samples, rows, cols = X_train.shape

print("Before flattening:")
for arr in (X_train, y_train, X_test, y_test):
    print(arr.shape)

# Squeeze the (n, 1) label arrays down to 1-D vectors, and collapse each
# (rows, cols) sample window into a single flat feature row.
y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)
X_train = X_train.reshape(samples, -1)
X_test = X_test.reshape(X_test.shape[0], -1)

print("After flattening:")
for arr in (X_train, y_train, X_test, y_test):
    print(arr.shape)
Before flattening: (6500, 460, 12) (6500, 1) (6499, 460, 12) (6499, 1) After flattening: (6500, 5520) (6500,) (6499, 5520) (6499,)
# Prepare for K fold cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

k = 10
# again, random_state for sake of reproducibility
kf = KFold(n_splits=k, random_state=42, shuffle=True)

# Pooled confusion matrix accumulated across all folds.
# Rows are the TRUE labels, columns the PREDICTED labels; the third
# row/column holds the margins produced by pd.crosstab(margins=True).
matrix = pd.DataFrame(0, index=['True Yes', 'True No', ''],
                      columns=['Pred Yes', 'Pred No', ''])
fold_aucs = []  # held-out AUC of each fold, averaged at the end

start = time.time()
for train_index, test_index in kf.split(X_train):
    # identify the train and validation set within each fold
    X_fold_train, X_fold_test = X_train[train_index], X_train[test_index]
    y_fold_train, y_fold_test = y_train[train_index], y_train[test_index]
    # fit a fresh model on this fold's training split
    # (random_state fixed so the whole CV run is reproducible,
    # matching the KFold seed above)
    model = RandomForestClassifier(n_estimators=500, oob_score=True,
                                   min_samples_leaf=10, max_depth=3,
                                   min_samples_split=100, max_leaf_nodes=50,
                                   max_features='sqrt', n_jobs=8,
                                   random_state=42)
    model.fit(X_fold_train, y_fold_train)
    # predict labels and class probabilities on the held-out fold
    y_pred = model.predict(X_fold_test)
    y_prob = model.predict_proba(X_fold_test)[:, 1]
    # BUGFIX: AUC must be measured on the held-out fold; the original code
    # scored only the LAST fold's model on all of X_train after the loop,
    # which leaks the fold's own training data into the estimate.
    fold_aucs.append(roc_auc_score(y_fold_test, y_prob))
    # BUGFIX: pd.crosstab(index, columns) puts its FIRST argument on the
    # rows, so the ground truth must come first to match the 'True *' row
    # labels; the original passed y_pred first, silently swapping FP and FN
    # in every metric derived below.
    # NOTE(review): crosstab sorts class values alphabetically/numerically;
    # this labelling assumes the positive class lands in row/col 0 --
    # confirm against the actual label values in y_train.
    m = pd.crosstab(y_fold_test, y_pred, margins=True)
    m.index = ['True Yes', 'True No', '']
    m.columns = ['Pred Yes', 'Pred No', '']
    matrix += m
end = time.time()
print('Time taken:', end - start)
print(matrix)

# Derive pooled metrics from the accumulated confusion matrix.
TP, FN = matrix.iloc[0, 0], matrix.iloc[0, 1]
FP, TN = matrix.iloc[1, 0], matrix.iloc[1, 1]
total = TP + FN + FP + TN
print("Accuracy: \t\t\t\t\t\t\t", (TP + TN) / total)
print("Error Rate: \t\t\t\t\t\t\t", (FN + FP) / total)
recall = TP / (TP + FN)
print("True Positive Rate (TPR) | Sensitivity | Recall | Coverage\t", recall)
print("True Negative Rate (TNR) | Specificity \t\t\t\t",
      TN / (FP + TN))
print("False Positive Rate (FPR) is \t\t\t\t\t", FP / (FP + TN))
print("False Negative Rate (FNR) is \t\t\t\t\t", FN / (TP + FN))
precision = TP / (TP + FP)
print("Average Precision: \t\t\t\t\t\t", precision)
print("Average Recall: \t\t\t\t\t\t", recall)
print("Average F Measures: \t\t\t\t\t\t",
      (2 * precision * recall) / (precision + recall))
print("Average AUC: \t\t\t\t\t\t\t", np.mean(fold_aucs))
Time taken: 70.1884503364563
Pred Yes Pred No
True Yes 1847 168 2015
True No 598 3887 4485
2445 4055 6500
Accuracy: 0.8821538461538462
Error Rate: 0.11784615384615385
True Positive Rate (TPR) | Sensitivity | Recall | Coverage 0.9166253101736973
True Negative Rate (TNR) | Specificity 0.8666666666666667
False Positive Rate (FPR) is 0.13333333333333333
False Negative Rate (FNR) is 0.08337468982630274
Average Precision: 0.7554192229038855
Average Recall: 0.9166253101736973
Average F Measures: 0.8282511210762332
Average AUC: 0.9728629100381009
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# n_estimators = number of trees to bag
# Fit the final model on all available training samples.
# Tuned: min_samples_split, max_leaf_nodes, max_depth and min_samples_leaf.
# NOTE(review): max_depth=5 here but the cross-validation above used
# max_depth=3, so the CV estimates do not describe this exact model --
# confirm which setting is intended.
model = RandomForestClassifier(n_estimators=500, oob_score=True,
                               min_samples_leaf=10, max_depth=5,
                               min_samples_split=100, max_leaf_nodes=50,
                               max_features='sqrt', n_jobs=8,
                               random_state=42)  # fixed seed for reproducibility
start = time.time()
model.fit(X_train, y_train)
end = time.time()
print('Time taken:', end - start)
# Persist the fitted model so it can be reloaded without retraining.
joblib.dump(model, "./data/random_forest_10000.joblib")
Time taken: 9.850638628005981
['./data/random_forest_10000.joblib']
features_index = ["pi_x", "pi_y", "Pi_tot", "Fst", "Dxy", "Da", "Tajima's D on X",
"Tajima's D on Y", "Tajima's D across all samples", "SNP rel. dens. on X",
"SNP rel. dens. on Y", "SNP rel. dens. across all"]
for f, imp in zip(features_index, model.feature_importances_):
print(f'Feature {f} importance: {imp} \t\t\t\t\t\t')
features = model.feature_importances_.reshape(rows, cols).mean(axis=0)
feature_imp = pd.Series(features,index=features_index).sort_values(
ascending=False)
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.title("Important Features In Detecting SAS")
plt.show()
Feature pi_x importance: 0.0004193850576844676 Feature pi_y importance: 0.003616057749569257 Feature Pi_tot importance: 0.0003159326373812968 Feature Fst importance: 0.00047221471773658733 Feature Dxy importance: 0.00040848282711428624 Feature Da importance: 0.00019950735377009126 Feature Tajima's D on X importance: 0.00013689890022630029 Feature Tajima's D on Y importance: 0.0009553261387962288 Feature Tajima's D across all samples importance: 0.0005034417509281785 Feature SNP rel. dens. on X importance: 0.008656748862335133 Feature SNP rel. dens. on Y importance: 0.02002057889170379 Feature SNP rel. dens. across all importance: 0.010717915953457462
# Extract and visualize a single tree from the fitted forest.
# NOTE(review): index 203 is arbitrary -- any of the 500 estimators works.
estimator = model.estimators_[203]
from sklearn.tree import export_graphviz
# Give every flattened column a unique name ("stat (w<window>)") instead of
# repeating the 12 bare names `rows` times, so tree nodes are unambiguous.
# The ordering matches the row-major flattening done on X_train above.
flat_feature_names = [f'{feat} (w{r})' for r in range(rows)
                      for feat in features_index]
# NOTE(review): export_graphviz maps class_names positionally onto the
# classifier's sorted class labels; confirm ["SAS", "Neutral"] matches that
# order, otherwise the leaf labels are swapped.
export_graphviz(estimator, out_file='tree.dot',
                feature_names=flat_feature_names,
                class_names=["SAS", "Neutral"],
                rounded=True, proportion=False,
                precision=2, filled=True)
# Convert to png using a system command (requires Graphviz installed).
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display inline in the Jupyter notebook.
from IPython.display import Image
Image(filename='tree.png')
Below are performance measures on the test set, which was never touched during model building.
# Evaluate the final model on the held-out test set.
y_pred = model.predict(X_test)
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print(f'Our OOB prediction of accuracy is: {model.oob_score_ * 100}%')
# show stats
# BUGFIX: pd.crosstab puts its FIRST argument on the rows, so the ground
# truth must come first to match the 'True *' row labels; the original
# passed y_pred first, silently swapping FP and FN in the metrics below.
# NOTE(review): crosstab sorts class values; this labelling assumes the
# positive class lands in row/col 0 -- confirm against the label values.
matrix = pd.crosstab(y_test, y_pred, margins=True)
matrix.index = ['True Yes', 'True No', '']
matrix.columns = ['Pred Yes', 'Pred No', '']
print(matrix)
TP, FN = matrix.iloc[0, 0], matrix.iloc[0, 1]
FP, TN = matrix.iloc[1, 0], matrix.iloc[1, 1]
total = TP + FN + FP + TN
print("Accuracy: \t\t\t\t\t\t\t", (TP + TN) / total)
print("Error Rate: \t\t\t\t\t\t\t", (FN + FP) / total)
print("True Positive Rate (TPR) | Sensitivity | Recall | Coverage\t",
      TP / (TP + FN))
print("True Negative Rate (TNR) | Specificity \t\t\t\t",
      TN / (FP + TN))
print("False Positive Rate (FPR) is \t\t\t\t\t", FP / (FP + TN))
print("False Negative Rate (FNR) is \t\t\t\t\t", FN / (TP + FN))
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print("Average Precision: \t\t\t\t\t\t", precision)
print("Average F Measures: \t\t\t\t\t\t",
      (2 * recall * precision) / (recall + precision))
pred_prob = model.predict_proba(X_test)[:, 1]
print("Average AUC: \t\t\t\t\t\t\t", roc_auc_score(y_test, pred_prob))
Accuracy on Test Set: 0.49515310047699646
Our OOB prediction of accuracy is: 90.03076923076922%
Pred Yes Pred No
True Yes 1095 1076 2171
True No 2205 2123 4328
3300 3199 6499
Accuracy: 0.49515310047699646
Error Rate: 0.5048468995230035
True Positive Rate (TPR) | Sensitivity | Recall | Coverage 0.5043758636573008
True Negative Rate (TNR) | Specificity 0.4905268022181146
False Positive Rate (FPR) is 0.5094731977818854
False Negative Rate (FNR) is 0.4956241363426992
Average Precision: 0.33181818181818185
Average F Measures: 0.40029245110583084
Average AUC: 0.49705632441956293
from sklearn.metrics import precision_recall_curve

# Precision/recall over every decision threshold on the TEST set.
# (The original comment said "training set" -- the data used is X_test.)
y_prob = model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(
    y_test, y_prob, pos_label="Yes")

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    precision_recall_curve returns one more precision/recall value than
    thresholds, hence the [:-1] slices.
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])

plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

# Precision-vs-recall trade-off curve.
# BUGFIX: the legend previously labeled this line "Precision", which
# mislabels a curve whose y-axis is recall; call it what it is.
plt.plot(precisions, recalls, "b--", label="PR curve")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="upper left")
plt.show()
# ROC curve: the larger the area under it, the better the classifier.
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label="Yes")

def plot_roc_curve(fpr, tpr, label=None):
    """Draw one ROC trace together with the y=x chance diagonal."""
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # random-guess reference line
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

plot_roc_curve(fpr, tpr, "Random Forest")
plt.show()

from sklearn.metrics import roc_auc_score
print("The AUC is ", roc_auc_score(y_test, y_prob))
The AUC is 0.49705632441956293
Next steps:
Generating a diversity of inputs:
Transfer of knowledge:
Build the same model using alt_output (means)
Remaining issues: